import the data

# Load the stock-price data and transpose it so each row is one stock
# and each column is one time point.
# NOTE(review): hard-coded absolute path — this only runs on the original
# author's machine; consider a relative path or here::here().
setwd("/Users/ruizhipu/Desktop")
df <- t(read.csv("stock_price.csv", header = TRUE))

dimension of the data

dim(df)
## [1]  30 127

TASK 1

Use PCA to reduce the dimension of the stock-price information. Generate a scree plot and determine the number of principal components based on this plot. Plot the loadings for the first principal component.

# PCA on the stock series. scale. = TRUE standardizes every column to
# unit variance before the decomposition; the argument is spelled out in
# full (the original "scale =" relied on R's partial argument matching
# against prcomp's "scale." parameter).
pcastock <- prcomp(df, scale. = TRUE)
#pcastock
summary(pcastock)
## Importance of components:
##                            PC1    PC2    PC3    PC4     PC5     PC6    PC7
## Standard deviation     10.3315 2.4535 1.7092 1.5615 1.36449 1.10036 1.0391
## Proportion of Variance  0.8405 0.0474 0.0230 0.0192 0.01466 0.00953 0.0085
## Cumulative Proportion   0.8405 0.8879 0.9109 0.9301 0.94472 0.95426 0.9628
##                            PC8     PC9    PC10    PC11    PC12    PC13
## Standard deviation     0.93168 0.83322 0.76973 0.69701 0.63281 0.57939
## Proportion of Variance 0.00683 0.00547 0.00467 0.00383 0.00315 0.00264
## Cumulative Proportion  0.96959 0.97506 0.97973 0.98355 0.98670 0.98935
##                           PC14    PC15    PC16    PC17    PC18    PC19
## Standard deviation     0.56076 0.52209 0.44984 0.36731 0.34632 0.33101
## Proportion of Variance 0.00248 0.00215 0.00159 0.00106 0.00094 0.00086
## Cumulative Proportion  0.99182 0.99397 0.99556 0.99663 0.99757 0.99843
##                           PC20   PC21   PC22    PC23    PC24    PC25
## Standard deviation     0.27238 0.2256 0.1604 0.13618 0.12342 0.09085
## Proportion of Variance 0.00058 0.0004 0.0002 0.00015 0.00012 0.00006
## Cumulative Proportion  0.99902 0.9994 0.9996 0.99977 0.99989 0.99995
##                           PC26    PC27    PC28    PC29      PC30
## Standard deviation     0.06140 0.03348 0.02853 0.02202 1.046e-15
## Proportion of Variance 0.00003 0.00001 0.00001 0.00000 0.000e+00
## Cumulative Proportion  0.99998 0.99999 1.00000 1.00000 1.000e+00
# The loadings live in pcastock$rotation:
#pcastock$rotation
# Scores of each stock on the principal components (predict() on the fit
# returns the score matrix).
stockpc <- predict(pcastock)
# Scree plot of the component variances (title typo "screenplot" fixed).
plot(pcastock, main = "screeplot")
mtext(side = 1, "Stock Price Principal Components", line = 1, font = 2)

So we can choose PC1 as the single principal component, since it alone explains about 84% of the variance (cumulative proportion 0.8405).

plot(x = c(1:30),y = stockpc[,1], xlab = "index of the stock", ylab = "loadings of the PC1",main = "loadings for first principal componen")

Generate a scatterplot to project stocks on the first two principal components.

# Project the stocks onto the first two principal components; draw an
# empty frame (type = "n") and place each ticker symbol at its point.
plot(stockpc[, 1:2], type = "n", main = "first two principal components")
text(x = stockpc[, 1], y = stockpc[, 2], labels = row.names(df))

##Generate an MDS map to plot stocks on a two-dimensional space.

#set.seed(851982)
# Classical (metric) MDS: embed the 30 stocks in two dimensions from
# their pairwise Euclidean distances on the raw series.
stock.dis <- dist(df)
stock.mds <- cmdscale(stock.dis)
plot(stock.mds, type = "n", main = "MDS map")
text(stock.mds, labels = row.names(df))

##Use k-means and hierarchical clustering to group stocks. Specifically, you will generate 8 MDS maps for the stocks and color the stocks based on different clustering methods (k-means, h-clustering with single-link, h-clustering with complete-link, h-clustering with average-link) and different number of clusters (k = 3, k = 6). For each hierarchical clustering method, generate a dendrogram. ##hint: Standardize the data before performing clustering

k-means k = 3

set.seed(1)
# k-means with k = 3, 10 random restarts. Renamed from "knnstock": this
# is k-means clustering, not k-nearest-neighbors.
# NOTE(review): run on unscaled df although the hint asks for
# standardization — kept as-is to preserve the reported clusters.
km3 <- kmeans(df, centers = 3, nstart = 10)
#km3
o <- order(km3$cluster)
# Stocks sorted by cluster id, with readable column names.
dfk3 <- data.frame(stock = row.names(df)[o], cluster = km3$cluster[o])
#dfk3
plot(stock.mds, type = "n", main = "k-means k = 3")
# col = cluster + 1 so cluster 1 is not drawn in black (col = 1).
text(stock.mds, labels = row.names(df), col = km3$cluster + 1)

k-means k = 6

set.seed(1)
# k-means with k = 6, 10 random restarts. Renamed from "knnstock": this
# is k-means clustering, not k-nearest-neighbors.
km6 <- kmeans(df, centers = 6, nstart = 10)
#km6
o <- order(km6$cluster)
# Stocks sorted by cluster id, with readable column names.
dfk6 <- data.frame(stock = row.names(df)[o], cluster = km6$cluster[o])
#dfk6
plot(stock.mds, type = "n", main = "k-means k = 6")
text(stock.mds, labels = row.names(df), col = km6$cluster + 1)

library(cluster)

k = 3

h-clustering with single-link

# Standardize columns (mean 0, sd 1) before clustering, per the hint.
df2 <- scale(df)
# Single-link agglomerative clustering. metric spelling fixed:
# "euclidian" is not a valid value and fell through to agnes's Euclidean
# default, so the printed fit is unchanged.
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE,
                   method = "single")
stock2agg
## Call:     agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE,      method = "single") 
## Agglomerative coefficient:  0.7523622 
## Order of objects:
##  [1] AA   BAC  GE   PFE  INTC KO   T    MSFT CSCO VZ   MRK  HD   DIS  JPM 
## [15] AXP  JNJ  PG   TRV  WMT  KRFT MCD  DD   HPQ  BA   MMM  UTX  CVX  XOM 
## [29] CAT  IBM 
## Height (summary):
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.656   4.443   7.091   7.857   7.971  30.602 
## 
## Available components:
## [1] "order"     "height"    "ac"        "merge"     "diss"      "call"     
## [7] "method"    "order.lab" "data"
# Single-link hclust cut into 3 clusters.
# NOTE(review): distances come from the raw df, not the standardized df2,
# even though the hint asks for standardization — left as-is to match the
# printed results.
data.dis <- dist(df)
hc.fit <- hclust(data.dis, method = "single")
hcstock1 <- cutree(hc.fit, k = 3)
print(hcstock1)
##   AA  AXP   BA  BAC  CAT CSCO  CVX   DD  DIS   GE   HD  HPQ  IBM INTC  JNJ 
##    1    1    1    1    2    1    2    1    1    1    1    1    3    1    1 
##  JPM KRFT   KO  MCD  MMM  MRK MSFT  PFE   PG    T  TRV  UTX   VZ  WMT  XOM 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1

dendrogram single-link

plot(stock2agg, which.plots=2)

MDS plot

# MDS map colored by the 3 single-link clusters; +1 shifts cluster ids
# off color 1 (black).
plot(stock.mds,type = 'n')
text(stock.mds, labels=row.names(df), col = hcstock1+1)

h-clustering with complete-link

# Complete-link agnes on the standardized data. metric spelling fixed:
# the misspelled "euclidian" fell through to agnes's Euclidean default,
# so the fit is unchanged.
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE,
                   method = "complete")
stock2agg
## Call:     agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE,      method = "complete") 
## Agglomerative coefficient:  0.8528047 
## Order of objects:
##  [1] AA   BAC  GE   PFE  CSCO INTC MSFT AXP  DIS  JPM  HD   KO   T    VZ  
## [15] MRK  HPQ  DD   TRV  WMT  BA   JNJ  PG   KRFT MCD  MMM  UTX  CAT  CVX 
## [29] XOM  IBM 
## Height (summary):
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.656   5.926   7.848  12.116  11.863  61.404 
## 
## Available components:
## [1] "order"     "height"    "ac"        "merge"     "diss"      "call"     
## [7] "method"    "order.lab" "data"
# Complete-link hclust cut into 3 clusters (on the raw-data distances;
# the tree object gets its own name instead of being overwritten by the
# cutree vector as in the original).
data.dis <- dist(df)
hc.fit <- hclust(data.dis, method = "complete")
hcstock <- cutree(hc.fit, k = 3)
print(hcstock)
##   AA  AXP   BA  BAC  CAT CSCO  CVX   DD  DIS   GE   HD  HPQ  IBM INTC  JNJ 
##    1    2    2    1    2    1    2    2    2    1    1    2    3    1    2 
##  JPM KRFT   KO  MCD  MMM  MRK MSFT  PFE   PG    T  TRV  UTX   VZ  WMT  XOM 
##    2    2    1    2    2    1    1    1    2    1    2    2    1    2    2

dendrogram complete-link

plot(stock2agg, which.plots=2)

MDS plot

# MDS map colored by the 3 complete-link clusters.
plot(stock.mds,type = 'n')
text(stock.mds, labels=row.names(df), col = hcstock+1)

h-clustering with average-link

# Average-link agnes on the standardized data. metric spelling fixed:
# "euclidian" fell through to agnes's Euclidean default, so the fit is
# unchanged.
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE,
                   method = "average")
stock2agg
## Call:     agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE,      method = "average") 
## Agglomerative coefficient:  0.8142313 
## Order of objects:
##  [1] AA   BAC  CSCO GE   PFE  INTC MSFT HD   KO   T    VZ   MRK  AXP  DIS 
## [15] JPM  DD   TRV  WMT  HPQ  BA   MMM  UTX  JNJ  PG   KRFT MCD  CVX  XOM 
## [29] CAT  IBM 
## Height (summary):
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.656   4.856   7.676  10.128  11.166  47.382 
## 
## Available components:
## [1] "order"     "height"    "ac"        "merge"     "diss"      "call"     
## [7] "method"    "order.lab" "data"
# Average-link hclust cut into 3 clusters (raw-data distances).
data.dis <- dist(df)
hc.fit <- hclust(data.dis, method = "average")
hcstock <- cutree(hc.fit, k = 3)
print(hcstock)
##   AA  AXP   BA  BAC  CAT CSCO  CVX   DD  DIS   GE   HD  HPQ  IBM INTC  JNJ 
##    1    1    2    1    2    1    2    2    1    1    1    1    3    1    2 
##  JPM KRFT   KO  MCD  MMM  MRK MSFT  PFE   PG    T  TRV  UTX   VZ  WMT  XOM 
##    1    2    1    2    2    1    1    1    2    1    2    2    1    2    2

dendrogram average-link

plot(stock2agg, which.plots=2)

MDS plot

# MDS map colored by the 3 average-link clusters.
plot(stock.mds,type = 'n')
text(stock.mds, labels=row.names(df), col = hcstock+1)

##k = 6

h-clustering with single-link

# Keep df2 standardized for the k = 6 runs too (the original set
# df2 = df here). Because agnes(stand = TRUE) re-standardizes its input
# internally, the fit is identical either way — note the agglomerative
# coefficient matches the k = 3 section exactly — but this keeps df2
# consistent with the hint and with the k = 3 code.
df2 <- scale(df)
# Single-link agnes; "euclidian" spelling fixed (fell through to the
# Euclidean default, so results are unchanged).
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE,
                   method = "single")
stock2agg
## Call:     agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE,      method = "single") 
## Agglomerative coefficient:  0.7523622 
## Order of objects:
##  [1] AA   BAC  GE   PFE  INTC KO   T    MSFT CSCO VZ   MRK  HD   DIS  JPM 
## [15] AXP  JNJ  PG   TRV  WMT  KRFT MCD  DD   HPQ  BA   MMM  UTX  CVX  XOM 
## [29] CAT  IBM 
## Height (summary):
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.656   4.443   7.091   7.857   7.971  30.602 
## 
## Available components:
## [1] "order"     "height"    "ac"        "merge"     "diss"      "call"     
## [7] "method"    "order.lab" "data"
# Single-link hclust cut into 6 clusters (raw-data distances).
data.dis <- dist(df)
hc.fit <- hclust(data.dis, method = "single")
hcstock <- cutree(hc.fit, k = 6)
print(hcstock)
##   AA  AXP   BA  BAC  CAT CSCO  CVX   DD  DIS   GE   HD  HPQ  IBM INTC  JNJ 
##    1    1    2    1    3    1    3    4    1    1    1    1    5    1    4 
##  JPM KRFT   KO  MCD  MMM  MRK MSFT  PFE   PG    T  TRV  UTX   VZ  WMT  XOM 
##    1    4    1    2    6    1    1    1    4    1    4    2    1    4    2

dendrogram single-link

plot(stock2agg, which.plots=2)

MDS plot

# MDS map colored by the 6 single-link clusters.
plot(stock.mds,type = 'n')
text(stock.mds, labels=row.names(df), col = hcstock+1)

h-clustering with complete-link

# Complete-link agnes on df2. metric spelling fixed ("euclidian" fell
# through to the Euclidean default, so the fit is unchanged).
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE,
                   method = "complete")
stock2agg
## Call:     agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE,      method = "complete") 
## Agglomerative coefficient:  0.8528047 
## Order of objects:
##  [1] AA   BAC  GE   PFE  CSCO INTC MSFT AXP  DIS  JPM  HD   KO   T    VZ  
## [15] MRK  HPQ  DD   TRV  WMT  BA   JNJ  PG   KRFT MCD  MMM  UTX  CAT  CVX 
## [29] XOM  IBM 
## Height (summary):
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.656   5.926   7.848  12.116  11.863  61.404 
## 
## Available components:
## [1] "order"     "height"    "ac"        "merge"     "diss"      "call"     
## [7] "method"    "order.lab" "data"
# Complete-link hclust cut into 6 clusters (raw-data distances).
data.dis <- dist(df)
hc.fit <- hclust(data.dis, method = "complete")
hcstock <- cutree(hc.fit, k = 6)
print(hcstock)
##   AA  AXP   BA  BAC  CAT CSCO  CVX   DD  DIS   GE   HD  HPQ  IBM INTC  JNJ 
##    1    2    3    1    4    1    4    2    2    1    5    2    6    1    3 
##  JPM KRFT   KO  MCD  MMM  MRK MSFT  PFE   PG    T  TRV  UTX   VZ  WMT  XOM 
##    2    3    5    3    4    5    5    1    3    5    3    4    5    2    4

dendrogram complete-link

plot(stock2agg, which.plots=2)

MDS plot

# MDS map colored by the 6 complete-link clusters.
plot(stock.mds,type = 'n')
text(stock.mds, labels=row.names(df), col = hcstock+1)

h-clustering with average-link

# Average-link agnes on df2. metric spelling fixed ("euclidian" fell
# through to the Euclidean default, so the fit is unchanged).
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE,
                   method = "average")
stock2agg
## Call:     agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE,      method = "average") 
## Agglomerative coefficient:  0.8142313 
## Order of objects:
##  [1] AA   BAC  CSCO GE   PFE  INTC MSFT HD   KO   T    VZ   MRK  AXP  DIS 
## [15] JPM  DD   TRV  WMT  HPQ  BA   MMM  UTX  JNJ  PG   KRFT MCD  CVX  XOM 
## [29] CAT  IBM 
## Height (summary):
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.656   4.856   7.676  10.128  11.166  47.382 
## 
## Available components:
## [1] "order"     "height"    "ac"        "merge"     "diss"      "call"     
## [7] "method"    "order.lab" "data"
# Average-link hclust cut into 6 clusters (raw-data distances).
data.dis <- dist(df)
hc.fit <- hclust(data.dis, method = "average")
hcstock <- cutree(hc.fit, k = 6)
print(hcstock)
##   AA  AXP   BA  BAC  CAT CSCO  CVX   DD  DIS   GE   HD  HPQ  IBM INTC  JNJ 
##    1    2    3    1    4    1    4    5    2    1    2    2    6    1    5 
##  JPM KRFT   KO  MCD  MMM  MRK MSFT  PFE   PG    T  TRV  UTX   VZ  WMT  XOM 
##    2    5    2    3    4    2    2    1    5    2    5    3    2    5    3

dendrogram average-link

plot(stock2agg, which.plots=2)

MDS plot

# MDS map colored by the 6 average-link clusters.
plot(stock.mds,type = 'n')
text(stock.mds, labels=row.names(df), col = hcstock+1)

TASK 2

library(foreign)
# Read the 113th Senate roll-call file; drop the President (state code
# 99 in the voteview data) and the first 9 metadata columns, keeping
# only the vote columns.
dfvt <- read.dta("sen113kh.dta")
senators <- subset(dfvt, state < 99)
dfvote <- senators[, -c(1:9)]

Create a senator-by-senator distance matrix for the 113th Congress. Generate an MDS plot to project the senators onto the two-dimensional space. Use shapes or colors to differentiate the senators’ party affiliation.

the distance matrix is ex.vote

# NOTE(review): ex.vote = V %*% t(V) is a senator-by-senator inner-product
# *similarity* matrix, not a distance matrix; the MDS below is actually
# built from dist() on the recoded vote matrix dt.
ex.vote = as.matrix(dfvote) %*% t(dfvote)
#ex.dist = dist(ex.vote)
#ex.mds = cmdscale(ex.dist)
#plot(ex.mds, type = 'n')
#text(ex.mds,dt[,9],col = dt[,6]+1)
no.pres <- subset(dfvt, state < 99)
# Recode the roll-call codes, vectorized over the whole vote matrix
# instead of the original per-column ifelse loop:
#   codes > 6 (absent / not a member) -> 0
#   codes 1..3 (Yea variants)         -> 1
#   codes 4..6 (Nay variants)         -> -1
# The step order matters: after the first two steps only the Nay codes
# are still > 1. which() keeps any NA entries untouched, matching
# ifelse()'s NA propagation.
votes <- as.matrix(no.pres[, 10:ncol(no.pres)])
votes[which(votes > 6)] <- 0
votes[which(votes > 0 & votes < 4)] <- 1
votes[which(votes > 1)] <- -1
dt <- votes
library(ggplot2)
# Senator-by-senator distances on the recoded votes, embedded in 2-D.
rollcall.dist <- dist(dt)
rollcall.mds <- cmdscale(rollcall.dist)
congress <- subset(dfvt, state < 99)
# First token of each "LASTNAME, First ..." name. vapply pins the return
# type to character(1) (sapply's output type depends on its input).
congress.names <- vapply(as.character(congress$name),
                         function(n) strsplit(n, "[, ]")[[1]][1],
                         character(1))

# Attach name and party (as a factor of the voteview codes) to the
# MDS coordinates for plotting.
rollcall.mds <- transform(rollcall.mds,
                          name = congress.names,
                          party = as.factor(congress$party))

# MDS scatter of senators: party is double-encoded as shape and color.
# Party codes per the voteview data: 100 = Democrat, 200 = Republican,
# 328 = Independent.
mds = ggplot(rollcall.mds, aes(x = X1, y = X2)) +
                scale_alpha(guide="none") + theme_bw() +
                theme(axis.ticks = element_blank(),
                axis.text.x = element_blank(),
                axis.text.y = element_blank()) +
                xlab("") +
                ylab("") +
                scale_shape(name = "party", breaks = c("100", "200", "328"),
                            labels = c("Dem.", "Rep.", "Ind."), solid = FALSE) +
                scale_color_manual(name = "party", values = c("100" = "blue",
                                                              "200" = "red",
                                                              "328"="green"),
                                                              breaks = c("100", "200", "328"),
                                                              labels = c("Dem.", "Rep.", "Ind.")
                                   )
# Render once with point markers (shape + color both mapped to party).
print(mds + geom_point(aes(shape = party,
                           color = party,
                                alpha = 0.75),size=3))

# Alias kept so the assignment's expected variable name exists.
ex.mds = rollcall.mds
# Render again with name labels. Fixed: map the data column directly
# (label = name) instead of reaching outside the plot data with
# rollcall.mds$name, which bypasses ggplot's data masking and breaks
# under faceting/filtering; the plotted labels are identical here.
print(mds + geom_text(aes(color = party,
                          alpha = 0.75,
                          label = name), size = 3))

Compute the purity and entropy for these clustering results with respect to the senators’ party labels. You will generate a 2x4 table

# NOTE(review): knnvote, hccut1/hccut2/hccut3, cluster.purity() and
# cluster.entropy() are not defined anywhere in this file — presumably
# created in an earlier chunk or a sourced helper; confirm they exist
# before running this chunk.
# Renamed "class" -> "true.party" to avoid shadowing base::class().
true.party = rollcall.mds$party

# k-means
cluster2 = as.vector(knnvote$cluster)
kmp = cluster.purity(cluster2, true.party)
kme = cluster.entropy(cluster2, true.party)

# h-clustering, single-link
cluster2 = as.vector(hccut1)
sp = cluster.purity(cluster2, true.party)
se = cluster.entropy(cluster2, true.party)

# h-clustering, complete-link. Fixed swapped names: the original bound
# purity to "ce" and entropy to "cp"; the printed table was still correct
# because the matrix below happened to be filled in the matching order.
cluster2 = as.vector(hccut2)
cp = cluster.purity(cluster2, true.party)
ce = cluster.entropy(cluster2, true.party)

# h-clustering, average-link (same swapped-name fix)
cluster2 = as.vector(hccut3)
ap = cluster.purity(cluster2, true.party)
ae = cluster.entropy(cluster2, true.party)

# Assemble the 2 x 4 table: rows = Purity/Entropy, columns = methods.
table = matrix(c(kmp, kme, sp, se, cp, ce, ap, ae), ncol = 4, byrow = FALSE)
colnames(table) <- c("k-means","hclust-single","hclust-complete","hclust-average")
rownames(table) <- c("Purity","Entropy")
table <- as.table(table)
table
##           k-means hclust-single hclust-complete hclust-average
## Purity  0.9428571     0.5619048       0.9523810      0.9523810
## Entropy 0.3520954     1.0859032       0.2850529      0.2850529

Based on your observation on both measures and mis-classified members, choose two clustering methods that generate the most meaningful results and explain why.

H-clustering with complete-link and h-clustering with average-link generate the most meaningful results. The MDS plots show they have the fewest mis-classified senators, and they combine the highest purity (0.952, versus 0.943 for k-means) with the lowest entropy (0.285). Using pairwise distances between data points appears to be a better basis for forming these clusters than k-means' centroid criterion, so k-means does not form the best clusters here. Single-link performs worst because it suffers from a "chaining" effect: since it merges on the single closest pair, it is sensitive to noise, and wrongly assigned points can pull otherwise separate clusters together.